sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.5.1  magrittr_1.5    tools_3.5.1     htmltools_0.3.6
##  [5] yaml_2.2.0      Rcpp_1.0.0      stringi_1.2.4   rmarkdown_1.11 
##  [9] knitr_1.20      stringr_1.3.1   digest_0.6.18   evaluate_0.12

User Inputs

# Unpack user-supplied report parameters into top-level variables
output.var <- params$output.var
log.pred <- params$log.pred
eda <- params$eda
algo.forward <- params$algo.forward
algo.backward <- params$algo.backward
algo.stepwise <- params$algo.stepwise
algo.LASSO <- params$algo.LASSO
algo.LARS <- params$algo.LARS

# Echo the parameter list so the rendered report records the configuration
message("Parameters used for training/prediction: ")
## Parameters used for training/prediction:
str(params)
## List of 8
##  $ output.var   : chr "y3"
##  $ log.pred     : logi FALSE
##  $ eda          : logi TRUE
##  $ algo.forward : logi FALSE
##  $ algo.backward: logi FALSE
##  $ algo.stepwise: logi FALSE
##  $ algo.LASSO   : logi FALSE
##  $ algo.LARS    : logi FALSE
# Setup Labels
# alt.scale.label.name = Alternate Scale variable name
#   - if predicting on log, then alt.scale is normal scale
#   - if predicting on normal scale, then alt.scale is log scale
if (log.pred) {
  # Predicting on the log scale: label column carries a "log." prefix
  label.names <- paste0("log.", output.var)
  alt.scale.label.name <- output.var
} else {
  # Predicting on the raw scale: the alternate scale is the log version
  label.names <- output.var
  alt.scale.label.name <- paste0("log.", output.var)
}

Prepare Data

Read and Clean Features

# Load the feature table (one row per JobName)
features <- read.csv("../../Data/features.csv")
# str(features)

Checking correlations to evaluate removal of redundant features

# Pairwise correlations among all numeric features, rounded for display
corr.matrix <- round(cor(features[sapply(features, is.numeric)]), 2)

# Keep only variables involved in at least one strong pairwise correlation
threshold <- 0.6
corr.matrix.tmp <- corr.matrix
diag(corr.matrix.tmp) <- 0  # zero out the trivial self-correlations
keep <- apply(abs(corr.matrix.tmp) >= threshold, 1, any)
high.corr.matrix <- corr.matrix.tmp[keep, keep]

DT::datatable(corr.matrix)
DT::datatable(high.corr.matrix)

Feature Names

# Feature columns are every column except the JobName identifier
feature.names <- setdiff(colnames(features), "JobName")
# str(feature.names)

Read and Clean Labels

# Load labels, keeping only the ID column plus the selected output variable
labels <- read.csv("../../Data/labels.csv")
# str(labels)
labels <- labels[, c("JobName", output.var)]
summary(labels)
##       JobName           y3        
##  Job_00001:   1   Min.   : 95.91  
##  Job_00002:   1   1st Qu.:118.21  
##  Job_00003:   1   Median :123.99  
##  Job_00004:   1   Mean   :125.36  
##  Job_00005:   1   3rd Qu.:131.06  
##  Job_00006:   1   Max.   :193.73  
##  (Other)  :9994   NA's   :2497

Merge Datasets

# Join features to labels on JobName, then drop the ID column
data <- merge(features, labels, by = "JobName")
data <- data[, colnames(data) != "JobName"]
# str(data)

Transformations

#str(data)
if (log.pred) {
  # Replace the raw label with its base-10 log, then drop the raw column
  data[[label.names]] <- log(data[[alt.scale.label.name]], 10)
  data <- data[names(data) != alt.scale.label.name]
}
#str(data)

Remove NA Cases

data = data[complete.cases(data),]

Check correlation of Label with Features

# Quick relevance scan: correlation of every feature against the label column.
if (eda == TRUE){
  # NOTE(review): `one_of()` and `select_at()` are superseded in modern
  # dplyr/tidyselect (>= 1.0) by `all_of()` and plain `select()`; kept as-is
  # for compatibility with the package versions recorded in sessionInfo.
  corr.to.label =round(cor(dplyr::select(data,-one_of(label.names)),dplyr::select_at(data,label.names)),4)
  DT::datatable(corr.to.label)
}

Multicollinearity - VIF

# Variance Inflation Factors (usdm::vif) to screen for multicollinearity,
# sorted so the most collinear features appear first.
if (eda == TRUE){
  # NOTE(review): `select_at`, `%>%`, `arrange`, and `desc` are used without
  # namespace prefixes — this assumes dplyr/magrittr are attached by a setup
  # chunk not visible here (sessionInfo above shows magrittr only loaded via
  # namespace). Verify, or prefix with dplyr:: as done elsewhere in this file.
  vifDF = usdm::vif(select_at(data,feature.names)) %>% arrange(desc(VIF))
  head(vifDF,10)
}
##    Variables      VIF
## 1     stat31 1.066774
## 2    stat113 1.059555
## 3     stat98 1.059344
## 4    stat105 1.059191
## 5        x22 1.058846
## 6    stat206 1.058561
## 7    stat178 1.058379
## 8    stat179 1.058364
## 9    stat142 1.058288
## 10   stat171 1.057939

Exploratory Data Analysis

Scatterplots

# Panel function for pairs(): draws a cyan histogram of `x` in a diagonal
# panel, rescaled so the tallest bar spans the panel height.
panel.hist <- function(x, ...) {
  usr <- par("usr")
  on.exit(par(usr))  # restore the caller's coordinate system on exit
  par(usr = c(usr[1:2], 0, 1.5))
  hinfo <- hist(x, plot = FALSE)
  edges <- hinfo$breaks
  n.edges <- length(edges)
  bar.heights <- hinfo$counts / max(hinfo$counts)
  rect(edges[-n.edges], 0, edges[-1], bar.heights, col = "cyan", ...)
}
# Histogram of the (possibly log-transformed) label during EDA
if (eda) {
  hist(data[, label.names])
  # hist(data[complete.cases(data), alt.scale.label.name])
}

# https://stackoverflow.com/questions/24648729/plot-one-numeric-variable-against-n-numeric-variables-in-n-plots
# Scatterplot of each x-variable against a single y-variable, one plot each.
#   data  - data.frame holding all columns
#   xvars - character vector of predictor column names; defaults to every
#           column except `yvar`
#   yvar  - name of the response column (y-axis of every plot)
ind.pairs.plot <- function(data, xvars=NULL, yvar)
{
    if (is.null(xvars)) {
        xvars = names(data)[names(data) != yvar]
    }

    # seq_along() is safe when xvars is empty (1:length(xvars) would be 1:0
    # and loop twice with invalid indices)
    for(i in seq_along(xvars)){
        plot(data[,xvars[i]], data[,yvar], xlab = xvars[i], ylab = yvar)
    }
}

# Draw per-feature scatterplots against the label when EDA is enabled
if (eda) {
  ind.pairs.plot(data, feature.names, label.names)
}

Feature Engineering

# Feature engineering: sqrt-transform x18 (its scatter vs. the label looks
# more linear on the sqrt scale).
# NOTE(review): this block runs only when eda == FALSE, so in the recorded run
# (eda = TRUE per str(params) above) the transform was SKIPPED — the model
# formula below still contains raw x18. If the transform is meant to apply
# during modeling, the condition looks inverted; confirm intent before changing.
if(eda ==FALSE){
  # x18 may need transformations
  plot(data[,'x18'], data[,label.names], main = "Original Scatter Plot vs. x18", ylab = label.names, xlab = 'x18')
  plot(sqrt(data[,'x18']), data[,label.names], main = "Original Scatter Plot vs. sqrt(x18)", ylab = label.names, xlab = 'sqrt(x18)')
  
  # transforming x18
  data$sqrt.x18 = sqrt(data$x18)
  data = dplyr::select(data,-one_of('x18'))
  
  # what about x7, x9?
  # x11 looks like data is at discrete points after a while. Will this be a problem?
}

Modeling

Train Test Split

# Shuffle the rows, then take an 80/20 train/test split on the label
# (sample.split comes from the caTools package).
data <- data[sample(nrow(data)), ]
split <- sample.split(data[, label.names], SplitRatio = 0.8)

data.train <- subset(data, split == TRUE)
data.test <- subset(data, split == FALSE)

Common Functions

# Standard regression diagnostics for a fitted lm:
#   - the base plot.lm panels
#   - studentized and standardized residuals vs. fitted values
#   - histogram of studentized residuals with a N(0,1) density overlay
#   model - fitted lm object
#   train - data used to compute fitted values via predict()
plot.diagnostics <- function(model, train) {
  plot(model)

  r.standard <- rstandard(model)
  r.student <- rstudent(model)
  # Hoisted: predict() was previously computed once per residual plot
  fitted.vals <- predict(model, train)

  plot(fitted.vals, r.student,
       ylab = "Student Residuals", xlab = "Predicted Values",
       main = "Student Residual Plot")
  abline(0, 0)

  plot(fitted.vals, r.standard,
       ylab = "Standard Residuals", xlab = "Predicted Values",
       main = "Standard Residual Plot")
  abline(0, 0)
  # +/- 2 reference bands: points outside are potential outliers
  abline(2, 0)
  abline(-2, 0)

  # Histogram of studentized residuals on the density scale
  hist(r.student, freq = FALSE, main = "Distribution of Studentized Residuals",
       xlab = "Studentized Residuals", ylab = "Density", ylim = c(0, 0.5))

  # Overlay the standard normal density for visual comparison
  # (extend 1 unit past the observed range so the curve tails are visible)
  xfit <- seq(min(r.student) - 1, max(r.student) + 1, length = 40)
  yfit <- dnorm(xfit)
  lines(xfit, yfit)
}

Setup Formulae

# Build "label ~ all features" and the intercept-only (grand mean) formula
# from the training-set column names.
n <- names(data.train)
lhs <- paste(n[n %in% label.names], collapse = " + ")
rhs <- paste(n[!n %in% label.names], collapse = " + ")
formula <- as.formula(paste(lhs, " ~", rhs))
grand.mean.formula <- as.formula(paste(lhs, " ~ 1"))
print(formula)
## y3 ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + x10 + x11 + 
##     x12 + x13 + x14 + x15 + x16 + x17 + x18 + x19 + x20 + x21 + 
##     x22 + x23 + stat1 + stat2 + stat3 + stat4 + stat5 + stat6 + 
##     stat7 + stat8 + stat9 + stat10 + stat11 + stat12 + stat13 + 
##     stat14 + stat15 + stat16 + stat17 + stat18 + stat19 + stat20 + 
##     stat21 + stat22 + stat23 + stat24 + stat25 + stat26 + stat27 + 
##     stat28 + stat29 + stat30 + stat31 + stat32 + stat33 + stat34 + 
##     stat35 + stat36 + stat37 + stat38 + stat39 + stat40 + stat41 + 
##     stat42 + stat43 + stat44 + stat45 + stat46 + stat47 + stat48 + 
##     stat49 + stat50 + stat51 + stat52 + stat53 + stat54 + stat55 + 
##     stat56 + stat57 + stat58 + stat59 + stat60 + stat61 + stat62 + 
##     stat63 + stat64 + stat65 + stat66 + stat67 + stat68 + stat69 + 
##     stat70 + stat71 + stat72 + stat73 + stat74 + stat75 + stat76 + 
##     stat77 + stat78 + stat79 + stat80 + stat81 + stat82 + stat83 + 
##     stat84 + stat85 + stat86 + stat87 + stat88 + stat89 + stat90 + 
##     stat91 + stat92 + stat93 + stat94 + stat95 + stat96 + stat97 + 
##     stat98 + stat99 + stat100 + stat101 + stat102 + stat103 + 
##     stat104 + stat105 + stat106 + stat107 + stat108 + stat109 + 
##     stat110 + stat111 + stat112 + stat113 + stat114 + stat115 + 
##     stat116 + stat117 + stat118 + stat119 + stat120 + stat121 + 
##     stat122 + stat123 + stat124 + stat125 + stat126 + stat127 + 
##     stat128 + stat129 + stat130 + stat131 + stat132 + stat133 + 
##     stat134 + stat135 + stat136 + stat137 + stat138 + stat139 + 
##     stat140 + stat141 + stat142 + stat143 + stat144 + stat145 + 
##     stat146 + stat147 + stat148 + stat149 + stat150 + stat151 + 
##     stat152 + stat153 + stat154 + stat155 + stat156 + stat157 + 
##     stat158 + stat159 + stat160 + stat161 + stat162 + stat163 + 
##     stat164 + stat165 + stat166 + stat167 + stat168 + stat169 + 
##     stat170 + stat171 + stat172 + stat173 + stat174 + stat175 + 
##     stat176 + stat177 + stat178 + stat179 + stat180 + stat181 + 
##     stat182 + stat183 + stat184 + stat185 + stat186 + stat187 + 
##     stat188 + stat189 + stat190 + stat191 + stat192 + stat193 + 
##     stat194 + stat195 + stat196 + stat197 + stat198 + stat199 + 
##     stat200 + stat201 + stat202 + stat203 + stat204 + stat205 + 
##     stat206 + stat207 + stat208 + stat209 + stat210 + stat211 + 
##     stat212 + stat213 + stat214 + stat215 + stat216 + stat217
print(grand.mean.formula)
## y3 ~ 1
# Refresh feature.names from the training columns, because earlier steps may
# have transformed (added/removed) feature columns.
feature.names <- setdiff(n, label.names)

Full & Grand Means Model

# Fit the full model: label regressed on every available feature
model.full <- lm(formula, data.train)
summary(model.full)
## 
## Call:
## lm(formula = formula, data = data.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.422  -6.067  -1.711   4.532  55.809 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.453e+01  2.722e+00  34.732  < 2e-16 ***
## x1          -1.330e-01  1.903e-01  -0.699 0.484468    
## x2           4.795e-02  1.212e-01   0.395 0.692526    
## x3          -5.789e-03  3.325e-02  -0.174 0.861791    
## x4          -1.320e-02  2.636e-03  -5.008 5.67e-07 ***
## x5           1.534e-01  8.609e-02   1.782 0.074860 .  
## x6           2.002e-01  1.730e-01   1.157 0.247261    
## x7           3.316e+00  1.850e-01  17.928  < 2e-16 ***
## x8           1.459e-01  4.332e-02   3.367 0.000764 ***
## x9           8.738e-01  9.643e-02   9.062  < 2e-16 ***
## x10          3.455e-01  8.987e-02   3.844 0.000122 ***
## x11          7.188e+07  2.150e+07   3.342 0.000836 ***
## x12         -2.805e-02  5.463e-02  -0.514 0.607593    
## x13          1.035e-02  2.171e-02   0.477 0.633730    
## x14         -1.508e-01  9.388e-02  -1.606 0.108301    
## x15         -6.076e-02  8.991e-02  -0.676 0.499167    
## x16          3.394e-01  6.248e-02   5.433 5.76e-08 ***
## x17          3.983e-01  9.437e-02   4.221 2.47e-05 ***
## x18          1.661e+00  6.659e-02  24.949  < 2e-16 ***
## x19          2.794e-02  4.799e-02   0.582 0.560433    
## x20         -2.343e-01  3.338e-01  -0.702 0.482810    
## x21          3.772e-02  1.237e-02   3.049 0.002305 ** 
## x22         -1.623e-02  1.006e-01  -0.161 0.871817    
## x23         -1.400e-02  9.589e-02  -0.146 0.883918    
## stat1       -4.559e-02  7.223e-02  -0.631 0.527927    
## stat2       -2.650e-02  7.198e-02  -0.368 0.712820    
## stat3        1.420e-01  7.268e-02   1.954 0.050796 .  
## stat4       -9.102e-02  7.284e-02  -1.250 0.211526    
## stat5       -3.402e-02  7.228e-02  -0.471 0.637858    
## stat6       -7.023e-02  7.232e-02  -0.971 0.331530    
## stat7       -1.260e-02  7.213e-02  -0.175 0.861289    
## stat8        4.356e-02  7.200e-02   0.605 0.545226    
## stat9       -1.789e-02  7.179e-02  -0.249 0.803224    
## stat10      -1.238e-01  7.182e-02  -1.724 0.084824 .  
## stat11      -4.192e-02  7.269e-02  -0.577 0.564185    
## stat12       4.300e-02  7.198e-02   0.597 0.550204    
## stat13      -1.272e-01  7.135e-02  -1.783 0.074708 .  
## stat14      -2.366e-01  7.195e-02  -3.289 0.001012 ** 
## stat15      -4.286e-02  7.170e-02  -0.598 0.550049    
## stat16      -1.129e-02  7.180e-02  -0.157 0.875027    
## stat17      -8.145e-02  7.167e-02  -1.136 0.255826    
## stat18      -7.144e-02  7.167e-02  -0.997 0.318904    
## stat19       4.091e-02  7.189e-02   0.569 0.569324    
## stat20      -3.753e-02  7.195e-02  -0.522 0.601956    
## stat21      -1.594e-02  7.268e-02  -0.219 0.826365    
## stat22      -1.089e-01  7.235e-02  -1.506 0.132163    
## stat23       1.806e-01  7.198e-02   2.509 0.012130 *  
## stat24      -1.677e-01  7.204e-02  -2.327 0.019980 *  
## stat25      -1.343e-01  7.183e-02  -1.870 0.061472 .  
## stat26      -8.663e-02  7.207e-02  -1.202 0.229386    
## stat27       1.452e-02  7.208e-02   0.201 0.840387    
## stat28       2.085e-02  7.226e-02   0.289 0.772938    
## stat29       1.210e-01  7.268e-02   1.665 0.095993 .  
## stat30       8.621e-02  7.303e-02   1.181 0.237849    
## stat31      -3.783e-02  7.270e-02  -0.520 0.602818    
## stat32       7.471e-02  7.266e-02   1.028 0.303914    
## stat33      -1.148e-01  7.201e-02  -1.594 0.111010    
## stat34       4.494e-02  7.220e-02   0.622 0.533686    
## stat35      -1.122e-01  7.200e-02  -1.558 0.119303    
## stat36       4.093e-02  7.141e-02   0.573 0.566604    
## stat37      -1.190e-01  7.247e-02  -1.643 0.100501    
## stat38       1.107e-01  7.231e-02   1.531 0.125747    
## stat39      -7.847e-03  7.145e-02  -0.110 0.912544    
## stat40       6.254e-02  7.201e-02   0.868 0.385182    
## stat41      -1.538e-01  7.142e-02  -2.154 0.031316 *  
## stat42      -1.341e-01  7.167e-02  -1.871 0.061454 .  
## stat43      -9.789e-02  7.247e-02  -1.351 0.176813    
## stat44       4.461e-02  7.165e-02   0.623 0.533544    
## stat45      -6.899e-02  7.165e-02  -0.963 0.335654    
## stat46       1.034e-01  7.194e-02   1.438 0.150519    
## stat47       3.190e-02  7.245e-02   0.440 0.659713    
## stat48       5.232e-02  7.230e-02   0.724 0.469336    
## stat49       1.245e-01  7.138e-02   1.744 0.081258 .  
## stat50       3.205e-02  7.158e-02   0.448 0.654381    
## stat51       1.323e-01  7.214e-02   1.834 0.066767 .  
## stat52      -2.793e-02  7.256e-02  -0.385 0.700286    
## stat53      -4.729e-02  7.272e-02  -0.650 0.515498    
## stat54      -1.188e-01  7.261e-02  -1.636 0.101882    
## stat55       9.055e-02  7.123e-02   1.271 0.203660    
## stat56      -4.853e-02  7.194e-02  -0.675 0.500001    
## stat57      -2.709e-02  7.155e-02  -0.379 0.705017    
## stat58      -6.057e-02  7.155e-02  -0.847 0.397271    
## stat59      -8.178e-03  7.216e-02  -0.113 0.909777    
## stat60       1.514e-01  7.213e-02   2.099 0.035904 *  
## stat61       4.074e-02  7.236e-02   0.563 0.573506    
## stat62      -7.502e-02  7.158e-02  -1.048 0.294639    
## stat63       6.351e-02  7.198e-02   0.882 0.377673    
## stat64      -7.909e-02  7.176e-02  -1.102 0.270485    
## stat65      -1.305e-01  7.252e-02  -1.800 0.071951 .  
## stat66       8.899e-02  7.291e-02   1.220 0.222339    
## stat67       2.169e-02  7.237e-02   0.300 0.764442    
## stat68      -2.725e-02  7.206e-02  -0.378 0.705370    
## stat69      -5.561e-02  7.175e-02  -0.775 0.438394    
## stat70       1.145e-01  7.179e-02   1.595 0.110863    
## stat71      -3.144e-02  7.157e-02  -0.439 0.660442    
## stat72       3.687e-02  7.230e-02   0.510 0.610097    
## stat73       1.103e-01  7.214e-02   1.529 0.126217    
## stat74      -9.601e-02  7.226e-02  -1.329 0.184028    
## stat75       1.414e-02  7.267e-02   0.195 0.845736    
## stat76       6.563e-02  7.189e-02   0.913 0.361283    
## stat77       1.651e-02  7.221e-02   0.229 0.819150    
## stat78      -2.063e-02  7.206e-02  -0.286 0.774694    
## stat79      -3.536e-02  7.246e-02  -0.488 0.625533    
## stat80       2.830e-02  7.285e-02   0.389 0.697640    
## stat81       8.520e-02  7.227e-02   1.179 0.238482    
## stat82       3.443e-03  7.184e-02   0.048 0.961781    
## stat83      -1.189e-02  7.195e-02  -0.165 0.868770    
## stat84       1.453e-02  7.229e-02   0.201 0.840716    
## stat85       8.610e-03  7.213e-02   0.119 0.904994    
## stat86       3.014e-02  7.215e-02   0.418 0.676172    
## stat87      -3.122e-02  7.250e-02  -0.431 0.666747    
## stat88      -1.306e-02  7.147e-02  -0.183 0.854992    
## stat89      -5.852e-02  7.173e-02  -0.816 0.414655    
## stat90      -4.459e-02  7.218e-02  -0.618 0.536812    
## stat91      -1.601e-01  7.166e-02  -2.234 0.025505 *  
## stat92      -9.821e-02  7.194e-02  -1.365 0.172267    
## stat93      -3.398e-02  7.293e-02  -0.466 0.641298    
## stat94      -1.970e-02  7.197e-02  -0.274 0.784301    
## stat95      -4.110e-02  7.195e-02  -0.571 0.567816    
## stat96      -6.225e-02  7.241e-02  -0.860 0.390005    
## stat97      -7.469e-03  7.136e-02  -0.105 0.916636    
## stat98       9.945e-01  7.121e-02  13.966  < 2e-16 ***
## stat99       9.422e-02  7.220e-02   1.305 0.191954    
## stat100      1.982e-01  7.239e-02   2.739 0.006189 ** 
## stat101     -8.971e-02  7.312e-02  -1.227 0.219955    
## stat102      4.520e-02  7.221e-02   0.626 0.531434    
## stat103     -3.401e-02  7.326e-02  -0.464 0.642512    
## stat104     -6.126e-02  7.244e-02  -0.846 0.397783    
## stat105      7.291e-02  7.152e-02   1.019 0.308043    
## stat106     -5.050e-02  7.180e-02  -0.703 0.481828    
## stat107     -3.187e-02  7.232e-02  -0.441 0.659437    
## stat108     -6.683e-02  7.209e-02  -0.927 0.354004    
## stat109      2.018e-02  7.218e-02   0.280 0.779809    
## stat110     -9.673e-01  7.155e-02 -13.519  < 2e-16 ***
## stat111      4.178e-02  7.245e-02   0.577 0.564185    
## stat112     -9.160e-02  7.253e-02  -1.263 0.206666    
## stat113      9.453e-03  7.264e-02   0.130 0.896463    
## stat114      5.242e-02  7.229e-02   0.725 0.468432    
## stat115      3.961e-02  7.178e-02   0.552 0.581074    
## stat116      7.419e-02  7.244e-02   1.024 0.305797    
## stat117      4.926e-02  7.228e-02   0.681 0.495583    
## stat118     -4.218e-02  7.186e-02  -0.587 0.557227    
## stat119      1.669e-02  7.215e-02   0.231 0.817063    
## stat120      1.605e-02  7.169e-02   0.224 0.822863    
## stat121     -1.236e-02  7.214e-02  -0.171 0.863993    
## stat122     -2.769e-02  7.198e-02  -0.385 0.700541    
## stat123      7.683e-02  7.339e-02   1.047 0.295178    
## stat124     -2.495e-02  7.216e-02  -0.346 0.729552    
## stat125      8.646e-02  7.234e-02   1.195 0.232074    
## stat126      9.590e-02  7.179e-02   1.336 0.181645    
## stat127     -2.855e-03  7.175e-02  -0.040 0.968265    
## stat128     -6.034e-02  7.229e-02  -0.835 0.403927    
## stat129     -5.230e-03  7.208e-02  -0.073 0.942153    
## stat130      3.353e-02  7.221e-02   0.464 0.642464    
## stat131     -5.837e-02  7.220e-02  -0.808 0.418842    
## stat132     -2.335e-02  7.169e-02  -0.326 0.744666    
## stat133      3.967e-02  7.216e-02   0.550 0.582499    
## stat134     -3.905e-02  7.170e-02  -0.545 0.586026    
## stat135     -3.763e-02  7.218e-02  -0.521 0.602112    
## stat136      4.740e-02  7.222e-02   0.656 0.511655    
## stat137     -2.850e-02  7.198e-02  -0.396 0.692120    
## stat138      5.991e-03  7.182e-02   0.083 0.933528    
## stat139     -3.735e-02  7.222e-02  -0.517 0.605050    
## stat140     -3.976e-02  7.180e-02  -0.554 0.579813    
## stat141      5.683e-02  7.162e-02   0.793 0.427524    
## stat142     -4.771e-02  7.292e-02  -0.654 0.512962    
## stat143      3.615e-03  7.176e-02   0.050 0.959826    
## stat144      1.395e-01  7.146e-02   1.952 0.051005 .  
## stat145     -1.740e-02  7.281e-02  -0.239 0.811102    
## stat146     -9.462e-02  7.275e-02  -1.301 0.193457    
## stat147      2.570e-02  7.284e-02   0.353 0.724245    
## stat148     -6.022e-02  7.113e-02  -0.847 0.397193    
## stat149     -1.582e-01  7.302e-02  -2.167 0.030266 *  
## stat150     -2.945e-02  7.216e-02  -0.408 0.683185    
## stat151     -1.489e-01  7.323e-02  -2.033 0.042139 *  
## stat152     -1.146e-01  7.207e-02  -1.591 0.111748    
## stat153      4.537e-02  7.260e-02   0.625 0.531993    
## stat154     -1.358e-02  7.272e-02  -0.187 0.851826    
## stat155     -5.099e-02  7.195e-02  -0.709 0.478562    
## stat156      1.594e-01  7.248e-02   2.199 0.027902 *  
## stat157     -1.463e-02  7.175e-02  -0.204 0.838401    
## stat158      5.818e-03  7.337e-02   0.079 0.936801    
## stat159     -2.885e-03  7.195e-02  -0.040 0.968013    
## stat160      3.370e-02  7.218e-02   0.467 0.640598    
## stat161      7.736e-02  7.299e-02   1.060 0.289272    
## stat162      4.477e-03  7.178e-02   0.062 0.950268    
## stat163     -1.947e-04  7.268e-02  -0.003 0.997863    
## stat164      6.639e-02  7.257e-02   0.915 0.360328    
## stat165     -5.776e-03  7.171e-02  -0.081 0.935811    
## stat166     -8.617e-02  7.131e-02  -1.208 0.226933    
## stat167     -1.795e-02  7.185e-02  -0.250 0.802769    
## stat168     -3.778e-02  7.187e-02  -0.526 0.599105    
## stat169      4.980e-03  7.181e-02   0.069 0.944716    
## stat170     -5.207e-02  7.248e-02  -0.718 0.472482    
## stat171     -1.075e-02  7.282e-02  -0.148 0.882621    
## stat172      5.985e-02  7.210e-02   0.830 0.406498    
## stat173     -1.956e-02  7.224e-02  -0.271 0.786566    
## stat174      2.097e-02  7.237e-02   0.290 0.771973    
## stat175     -5.772e-02  7.248e-02  -0.796 0.425870    
## stat176      2.057e-02  7.172e-02   0.287 0.774213    
## stat177     -4.565e-03  7.280e-02  -0.063 0.949999    
## stat178     -2.444e-02  7.311e-02  -0.334 0.738148    
## stat179      4.643e-02  7.191e-02   0.646 0.518480    
## stat180      1.317e-02  7.175e-02   0.184 0.854410    
## stat181      6.927e-02  7.268e-02   0.953 0.340576    
## stat182     -3.640e-02  7.278e-02  -0.500 0.616987    
## stat183      3.659e-02  7.147e-02   0.512 0.608708    
## stat184     -8.906e-03  7.281e-02  -0.122 0.902655    
## stat185     -5.439e-02  7.143e-02  -0.761 0.446426    
## stat186     -5.383e-02  7.241e-02  -0.743 0.457281    
## stat187     -1.589e-01  7.143e-02  -2.225 0.026125 *  
## stat188      1.925e-02  7.178e-02   0.268 0.788555    
## stat189     -2.989e-02  7.196e-02  -0.415 0.677890    
## stat190      2.635e-02  7.200e-02   0.366 0.714373    
## stat191     -8.420e-03  7.227e-02  -0.116 0.907261    
## stat192      5.684e-02  7.291e-02   0.780 0.435687    
## stat193     -2.319e-02  7.275e-02  -0.319 0.749874    
## stat194     -5.032e-02  7.172e-02  -0.702 0.482993    
## stat195      9.792e-02  7.211e-02   1.358 0.174560    
## stat196      1.555e-02  7.268e-02   0.214 0.830588    
## stat197      2.652e-02  7.123e-02   0.372 0.709692    
## stat198     -1.067e-01  7.229e-02  -1.476 0.140091    
## stat199      1.164e-02  7.179e-02   0.162 0.871218    
## stat200     -1.375e-01  7.144e-02  -1.925 0.054266 .  
## stat201     -1.319e-02  7.225e-02  -0.183 0.855161    
## stat202     -8.990e-02  7.289e-02  -1.233 0.217460    
## stat203      2.161e-02  7.202e-02   0.300 0.764129    
## stat204     -1.287e-01  7.196e-02  -1.789 0.073681 .  
## stat205     -9.291e-02  7.161e-02  -1.297 0.194545    
## stat206     -4.570e-02  7.285e-02  -0.627 0.530471    
## stat207      1.125e-01  7.261e-02   1.549 0.121368    
## stat208     -2.075e-02  7.268e-02  -0.285 0.775278    
## stat209      2.144e-02  7.210e-02   0.297 0.766240    
## stat210     -2.033e-03  7.197e-02  -0.028 0.977462    
## stat211     -6.263e-02  7.171e-02  -0.873 0.382559    
## stat212     -1.068e-02  7.197e-02  -0.148 0.881994    
## stat213     -2.998e-02  7.221e-02  -0.415 0.678022    
## stat214     -1.414e-01  7.202e-02  -1.964 0.049616 *  
## stat215     -1.225e-01  7.220e-02  -1.697 0.089773 .  
## stat216     -5.887e-02  7.198e-02  -0.818 0.413500    
## stat217      4.466e-02  7.221e-02   0.619 0.536239    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.477 on 5761 degrees of freedom
## Multiple R-squared:  0.2376, Adjusted R-squared:  0.2058 
## F-statistic: 7.479 on 240 and 5761 DF,  p-value: < 2.2e-16
# Diagnostic plots for the full model
plot.diagnostics(model.full, data.train)

# Intercept-only (grand mean) baseline model
model.null <- lm(grand.mean.formula, data.train)
summary(model.null)
## 
## Call:
## lm(formula = grand.mean.formula, data = data.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.436  -7.039  -1.384   5.620  61.624 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 125.3483     0.1373   913.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.63 on 6001 degrees of freedom
plot.diagnostics(model.null, data.train)

## hat values (leverages) are all = 0.0001666111
##  and there are no factor predictors; no plot no. 5

Variable Selection

http://www.stat.columbia.edu/~martin/W2024/R10.pdf

Forward Selection

# Forward selection: start from the intercept-only model and add terms from
# the full-model scope while AIC improves. The fitted model is saved to disk.
if (algo.forward == TRUE){
  t1 = Sys.time()
  
  model.forward = step(model.null, scope=list(lower=model.null, upper=model.full), direction="forward")
  print(summary(model.forward))
  saveRDS(model.forward,file = "model_forward.rds")
  
  t2 = Sys.time()
  # t2 - t1 is a difftime; its units auto-select (secs/mins/hours)
  print (paste("Time taken for Forward Selection: ",t2-t1, sep = ""))
  
  plot.diagnostics(model.forward, data.train)
  
}

Backward Elimination

# Backward elimination: start from the full model and drop terms while AIC
# improves. Slow with ~240 predictors. The fitted model is saved to disk.
if (algo.backward == TRUE){
  # Takes too much time
  t1 = Sys.time()
  
  model.backward = step(model.full, data = data.train, direction="backward")
  print(summary(model.backward))
  # BUG FIX: previously saved model.forward under the backward filename
  saveRDS(model.backward, file = "model_backward.rds")
  
  t2 = Sys.time()
  print (paste("Time taken for Backward Elimination: ",t2-t1, sep = ""))
  
  plot.diagnostics(model.backward, data.train)
}

Stepwise Selection

# Stepwise (bidirectional) selection from the null model, bounded above by
# the full model. The fitted model is saved to disk.
if (algo.stepwise == TRUE){
  t1 = Sys.time()
  
  model.stepwise = step(model.null, scope=list(upper=model.full), data = data.train, direction="both")
  print(summary(model.stepwise))
  # BUG FIX: previously saved model.forward under the stepwise filename
  saveRDS(model.stepwise, file = "model_stepwise.rds")
  
  t2 = Sys.time()
  print (paste("Time taken for Stepwise Selection: ",t2-t1, sep = ""))
  
  plot.diagnostics(model.stepwise, data.train)
}

LASSO Selection

# LASSO via 5-fold cross-validated glmnet; coefficients are taken at the
# conservative lambda.1se solution and the non-zero ones reported.
if (algo.LASSO == TRUE){
  t1 = Sys.time()

  model.LASSO = cv.glmnet(as.matrix(data.train[,feature.names]), data.train[,label.names], nfolds = 5, standardize = TRUE)
  # print() needed: non-final expressions inside an if-block do not autoprint
  print(summary(model.LASSO))
  
  t2 = Sys.time()
  print (paste("Time taken for LASSO: ",t2-t1, sep = ""))
  
  plot(model.LASSO)
  best_lambda = model.LASSO$lambda.1se
  # BUG FIX: avoid floating-point `==` matching against the lambda path;
  # coef() extracts at the named lambda directly ([-1, 1] drops the intercept
  # row, leaving the same named coefficient vector as before)
  lasso_coef = coef(model.LASSO, s = "lambda.1se")[-1, 1]
  print (lasso_coef)
  print(lasso_coef[ abs(lasso_coef) > 0 ])
}
# summary(model.forward)
# summary(model.stepwise)